library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(DT)
library(pander)
library(ggrepel)
library(pheatmap)
library(ggdendro)
# In an interactive session, work from the project root located by `here`
if (interactive()) {
  setwd(here::here())
}
# Use the black-and-white ggplot2 theme for every plot drawn below
theme_set(theme_bw())
# AnnotationHub records restricted to zebrafish EnsDb objects
ah_Dr <- subset(
  subset(AnnotationHub(), species == "Danio rerio"),
  rdataclass == "EnsDb"
)
# Retrieve the record stored under hub ID "AH83189"
ensDb_Dr <- ah_Dr[["AH83189"]]
# Transcript metadata columns as a tibble
trEns_Dr <- as_tibble(mcols(transcripts(ensDb_Dr)))
# Transcript length = summed exon widths, one integer per transcript
trLen_Dr <- vapply(width(exonsBy(ensDb_Dr, "tx")), sum, integer(1))
# Per-gene length and GC summaries built from the transcript table.
#   aveLen / maxLen : mean and maximum transcript length
#   aveGc           : length-weighted mean of gc_content
#   longestGc       : gc_content of the longest transcript
# Both GC columns are divided by 100 (gc_content is presumably stored as a
# percentage -- confirm against the EnsDb).
geneGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  # Name the key explicitly instead of relying on a natural join
  left_join(trEns_Dr, by = "tx_id") %>%
  group_by(gene_id) %>%
  summarise(
    aveLen = mean(length),
    maxLen = max(length),
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc = aveGc / 100,
    longestGc = longestGc / 100
  )
# The same summaries, grouped by transcript rather than by gene
trGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Dr, by = "tx_id") %>%
  group_by(tx_id) %>%
  summarise(
    aveLen = mean(length),
    maxLen = max(length),
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc = aveGc / 100,
    longestGc = longestGc / 100
  )
# Gene ranges, with the metadata reduced to four annotation columns
genesGR_Dr <- genes(ensDb_Dr)
mcols(genesGR_Dr) <- mcols(genesGR_Dr)[
  c("gene_id", "gene_name", "gene_biotype", "entrezid")
]
# Transcript ranges, with the metadata reduced to five annotation columns
txGR_Dr <- transcripts(ensDb_Dr)
mcols(txGR_Dr) <- mcols(txGR_Dr)[
  c("tx_id", "tx_name", "tx_biotype", "tx_id_version", "gene_id")
]
An EnsDb object was obtained for Ensembl release 101 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release. For zebrafish, this consists of 37241 genes and 65905 transcripts.
This is a total RNA-seq dataset generated from a 3-way comparison of WT zebrafish (Danio rerio) with heterozygous mutants (psen2S4Ter/+) and homozygous mutants (psen2S4Ter/S4Ter). A previous analysis of this dataset identified the possibility of incomplete ribosomal RNA (rRNA) removal. The following analysis involves an investigation into possible reasons for incomplete rRNA removal and any bias this introduces into the data.
# FastQC archives for the raw reads. The pattern is anchored so that only
# names ending in ".zip" are picked up (the bare "zip" regex would match any
# file name merely containing those letters).
files <- list.files(
  path = "/hpcfs/users/a1647910/20200902_Psen2S4Ter/0_rawData/FastQC",
  pattern = "\\.zip$",
  full.names = TRUE
)
# One row per FastQC archive. Samples whose name contains "Ps2Ex" are
# labelled as the Psen2S4Ter zebrafish dataset; anything else stays NA.
samples <- tibble(
  # fixed() so the "." in the suffix is matched literally
  sample = str_remove(basename(files), fixed("_fastqc.zip"))
) %>%
  mutate(
    dataset = ifelse(str_detect(sample, "Ps2Ex"), "Psen2S4Ter", NA),
    organism = ifelse(str_detect(sample, "Ps2Ex"), "zebrafish", NA)
  )
# Distinct dataset labels found among the samples
datasets <- unique(samples$dataset)
The following analysis involves 12 paired-end samples across 1 dataset(s): Psen2S4Ter.
# Load every raw FastQC report
rawFqc <- FastqcDataList(files)
# Positions of the reports whose file name contains "Ps2Ex"
data <- grep("Ps2Ex", fqName(rawFqc))
# Shortened sample labels for the x-axis
labels <- fqName(rawFqc[data]) %>%
  str_remove("_6month_07_07_2016_F3") %>%
  str_remove("\\.fastq\\.gz") %>%
  str_remove("Ps2Ex3M1_")
# Read totals per library, relabelled with the shortened names
rawLib <- plotReadTotals(rawFqc[data]) +
  labs(subtitle = "Psen2S4Ter") +
  scale_x_discrete(labels = labels)
The library sizes of the unprocessed dataset(s) range between 27,979,654 and 37,144,975 reads.
# Display the read-totals plot built above
rawLib
rRNA transcripts are known to have high GC content. Therefore, inspecting the GC content of the raw reads is a logical starting point for detecting incomplete rRNA removal. If rRNA removal was incomplete, a spike in GC content above 70% is expected.
# Interactive line plot of per-read GC content for the selected libraries,
# drawn against the D. rerio transcriptome reference; legend suppressed
plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data],
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    labs(title = "Psen2S4Ter Dataset (D. rerio)") +
    theme(legend.position = "none")
)
GC content of reads in the dataset. Clear spikes above 70% GC are observed, which is likely due to incomplete rRNA depletion.
The top 30 overrepresented sequences were analysed using blastn and were found to be predominantly rRNA sequences.
# Collapse the overrepresented-sequences module to one row per sequence,
# rank by the highest percentage observed (then by number of samples), and
# tabulate the first 30 rows
getModule(rawFqc, "Overrep") %>%
  group_by(Sequence, Possible_Source) %>%
  summarise(
    `Found In` = n(),
    `Highest Percentage` = max(Percentage)
  ) %>%
  arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
  ungroup() %>%
  dplyr::slice(seq_len(30)) %>%
  mutate(
    # Percentage is on a 0-100 scale here; format as e.g. "1.72%"
    `Highest Percentage` = percent(`Highest Percentage` / 100, accuracy = 0.01)
  ) %>%
  kable(
    align = "llrr",
    caption = paste(
      "Top", nrow(.),"Overrepresented sequences.",
      "The number of samples they were found in is shown,",
      "along with the percentage of the most 'contaminated' sample."
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
| Sequence | Possible_Source | Found In | Highest Percentage |
|---|---|---|---|
| GTGGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGC | No Hit | 12 | 1.72% |
| CTGGGGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAG | No Hit | 10 | 1.69% |
| CCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATCC | No Hit | 12 | 1.32% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCACACCGCAGCGCATCCCCCC | No Hit | 9 | 1.31% |
| CTCCTGAAAAGGTTGTATCCTTTGTTAAAGGGGCTGTACCCTCTTTAACT | No Hit | 11 | 1.11% |
| GGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATC | No Hit | 12 | 1.09% |
| GGGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTG | No Hit | 12 | 1.07% |
| GGCCCGGCGCACGTCCAGAGTCGCCGCCGCGCACCGCAGCGCATCCCCCC | No Hit | 10 | 1.03% |
| CGAGAGGCTCTAGTTGATATACTACGGCGTAAAGGGTGGTTAAGGAACAA | No Hit | 12 | 1.01% |
| GGGGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAGCG | No Hit | 9 | 0.87% |
| CCTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGA | No Hit | 12 | 0.82% |
| CCCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATC | No Hit | 12 | 0.79% |
| GTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATCT | No Hit | 12 | 0.79% |
| GGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCAT | No Hit | 12 | 0.78% |
| CGGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.73% |
| GGGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTT | No Hit | 11 | 0.68% |
| CAGTGGCGTGCGCCTGTAATCCAAGCTACTGGGAGGCTGAGGCTGGCGGA | No Hit | 11 | 0.64% |
| CGGGTCGGGTGGGTAGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG | No Hit | 12 | 0.57% |
| CTTAGACGACCTGGTAGTCCAAGGCTCCCCCAGGAGCACCATATCGATAC | No Hit | 11 | 0.54% |
| AGCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCC | No Hit | 11 | 0.53% |
| GGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTTA | No Hit | 10 | 0.53% |
| CAGCCTATTTAACTTAGGGCCAACCCGTCTCTGTGGCAATAGAGTGGGAA | No Hit | 12 | 0.51% |
| GGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACGTGG | No Hit | 10 | 0.50% |
| GGGAGCGGCCGCCCCGCGGCGCCCCCTCTCGTTCCCGTCTCCGGAGCGCG | No Hit | 9 | 0.49% |
| CTGGGAGATGAATAAAAGTTCAGCTTCGTACACCCCAAATTAAAAAATTA | No Hit | 10 | 0.48% |
| GCCTATTTAACTTAGGGCCAACCCGTCTCTGTGGCAATAGAGTGGGAAGA | No Hit | 12 | 0.47% |
| GGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGAC | No Hit | 11 | 0.45% |
| CCCCCGAACCCTTCCAAGCCGAACCGGAGCCGGTCGCGGCGCACCGCCGA | No Hit | 10 | 0.45% |
| GTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGGACG | No Hit | 10 | 0.43% |
| GCCCACTACGACAACGTGTTTTGTAAATTATGATCTTTATTCTCCTGAAA | No Hit | 10 | 0.43% |
Raw libraries were trimmed using cutadapt v1.14 to remove Illumina adapter sequences. Bases with PHRED score < 30, NextSeq-induced polyG runs and reads shorter than 35bp were also removed.
# FastQC reports for the trimmed reads. The pattern is anchored so that only
# names ending in ".zip" are loaded.
trimFqc <- list.files(
  path = "/hpcfs/users/a1647910/20200902_Psen2S4Ter/1_trimmedData/FastQC",
  pattern = "\\.zip$",
  full.names = TRUE
) %>%
  FastqcDataList()
# Read totals before and after trimming, matched on Filename, with the
# fraction of reads discarded and retained
trimStats <- readTotals(rawFqc) %>%
  dplyr::rename(Raw = Total_Sequences) %>%
  left_join(readTotals(trimFqc), by = "Filename") %>%
  dplyr::rename(Trimmed = Total_Sequences) %>%
  mutate(
    Discarded = 1 - Trimmed / Raw,
    Retained = Trimmed / Raw
  )
After adapter trimming, between 4.16% and 5.08% of reads were discarded.
Trimmed reads were:
Aligned to rRNA sequences using the BWA-MEM algorithm to estimate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp, as it is faster and more accurate than the alternative algorithms BWA-backtrack and BWA-SW.
Aligned to the Danio rerio GRCz11 genome (Ensembl release 101) using STAR v2.7.0d and summarised with featureCounts from the Subread v1.5.2 package. These counts were used for all gene-level analysis.
# Per-sample rRNA mapping percentages, stored as "sample:percentage" lines
rRnaProp <- as_tibble(
  mutate(
    read.delim(
      "/hpcfs/users/a1647910/20200902_Psen2S4Ter/3_bwa/log/samples.mapped.all",
      header = FALSE,
      sep = ":",
      col.names = c("sample", "proportion")
    ),
    # Strip the run-specific parts of the name, then any directory prefix
    sample = str_remove_all(sample, "_6month_F3|[0-9]*_Ps2Ex3M1_|.mapped"),
    sample = basename(sample),
    # Percentage -> proportion
    proportion = proportion/100,
    dataset = "Psen2S4Ter",
    organism = "zebrafish",
    group = str_extract(sample, "(WT|Heter|Hom)")
  )
)
rRnaProp$dataset <- factor(rRnaProp$dataset, levels = c("Psen2S4Ter"))
# One bar per sample showing its rRNA proportion
ggplot(rRnaProp, aes(sample, proportion)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~dataset, scales = "free_x") +
  scale_y_continuous(labels = percent) +
  labs(x = "Sample", y = "Percent of Total", fill = "Read pair") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Percentages of each library that align to rRNA sequences with bwa mem.
# Distribution of rRNA proportion within each genotype group
ggplot(rRnaProp, aes(group, proportion, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = percent) +
  labs(
    x = "Genotype",
    y = "Percent of total RNA",
    title = "rRNA proportions of each genotype"
  ) +
  scale_fill_discrete(name = "Genotype")
# featureCounts gene-level output. It is parsed once and reused for both the
# count matrix and the original (uncleaned) column names.
countsFile <- "/hpcfs/users/a1647910/20200902_Psen2S4Ter/2_alignedData/featureCounts/genes.out"
rawCounts <- read_tsv(countsFile)
# Shorten the column names to sample labels, then build a DGEList with
# normalisation factors
dgeList <- rawCounts %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  set_colnames(str_remove(colnames(.), "_6month_F3")) %>%
  set_colnames(str_remove(colnames(.), "[0-9]*_Ps2Ex3M1_")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Gene annotation in the same row order as the counts, extended with the
# per-gene length/GC summaries
dgeList$genes <- genesGR_Dr[rownames(dgeList),]
mcols(dgeList$genes) %<>%
  as.data.frame() %>%
  left_join(geneGcLen_Dr, by = "gene_id")
# Sample-level covariates taken from the rRNA alignment summary
addInfo <- tibble(
  sample = rRnaProp$sample,
  dataset = "Psen2S4Ter",
  organism = "zebrafish",
  rRNA = rRnaProp$proportion
)
# Join on the sample label; row names are carried through the join by hand
dgeList$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
# Original column names of the sample columns (those containing "Ps2Ex3M1_")
dgeList$samples$filenames <- str_subset(colnames(rawCounts), "Ps2Ex3M1_")
# Genotype parsed from the sample label, with WT as the first level
dgeList$samples$group <- colnames(dgeList) %>%
  str_extract("(WT|Heter|Hom)") %>%
  factor(levels = c("WT", "Heter", "Hom"))
# Expand gene-level counts into per-sample run-length encodings of GC and
# length.
#
# x: an object with a `counts` matrix (genes in rows, samples in columns)
#    for which colnames(x) gives the sample names, e.g. the DGEList above.
#
# Returns a list with one DataFrame per sample holding two Rle columns:
#   gc     - aveGc of each gene, repeated `counts` times
#   logLen - log10(maxLen) of each gene, repeated `counts` times
# Entries with a zero count are dropped. GC and length values are looked up
# in the global geneGcLen_Dr table.
gcInfo <- function(x) {
  x$counts %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    as_tibble() %>%
    pivot_longer(
      cols = colnames(x),
      names_to = "sample",
      values_to = "counts"
    ) %>%
    dplyr::filter(counts > 0) %>%
    # Name the key explicitly instead of relying on a natural join
    left_join(geneGcLen_Dr, by = "gene_id") %>%
    dplyr::select(ends_with("id"), sample, counts, aveGc, maxLen) %>%
    split(f = .$sample) %>%
    lapply(
      # `df` is one sample's rows (renamed so it no longer shadows `x`)
      function(df) {
        DataFrame(
          gc = Rle(df$aveGc, df$counts),
          logLen = Rle(log10(df$maxLen), df$counts)
        )
      }
    )
}
# Summarise each element of `x` (one table per sample with `gc` and `logLen`
# columns) by the mean and standard deviation of both columns.
#
# Returns a tibble with one row per element of `x` and the columns
# sample, mn_gc, sd_gc, mn_logLen, sd_logLen.
gcSummary <- function(x) {
  # 4 x n matrix: one column of statistics per sample
  perSample <- vapply(
    x,
    function(df) {
      c(mean(df$gc), sd(df$gc), mean(df$logLen), sd(df$logLen))
    },
    numeric(4)
  )
  # Samples to rows, statistics to named columns
  stats <- t(perSample)
  colnames(stats) <- c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
  as_tibble(rownames_to_column(as.data.frame(stats), "sample"))
}
# Per-sample GC/length encodings and their summary statistics.
# NOTE: `rle` reuses the name of base::rle(); the name is kept in case later
# code refers to it.
rle <- gcInfo(dgeList)
sumGc <- gcSummary(rle)
# Mean log-length of counted genes against rRNA proportion
a <- sumGc %>%
  left_join(dgeList$samples, by = "sample") %>%
  ggplot(aes(rRNA, mn_logLen)) +
  geom_point(aes(colour = group), size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)",
    colour = "Genotype"
  )
# Mean GC content of counted genes against rRNA proportion
b <- sumGc %>%
  left_join(dgeList$samples, by = "sample") %>%
  ggplot(aes(rRNA, mn_gc)) +
  geom_point(aes(colour = group), size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content",
    colour = "Genotype"
  )
# Side-by-side panels with a shared legend. The title previously read
# "PsenS4Ter"; it now matches the dataset name used everywhere else.
ggarrange(
  a, b, ncol = 2, nrow = 1,
  common.legend = TRUE, legend = "bottom"
) %>%
  annotate_figure(top = "Psen2S4Ter Dataset (D. rerio)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
# Keep genes with more than 1 CPM in at least 6 libraries
genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(6)
dgeFilt <- dgeList[genes2keep, , keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# PCA on log-CPM with samples as observations
pca <- cpm(dgeFilt, log = TRUE) %>%
  t() %>%
  prcomp()
# Proportion of variance per component, computed once and reused for every
# axis label below
pcaVar <- summary(pca)$importance["Proportion of Variance", ]
pc1Lab <- paste0("PC1 (", percent(pcaVar[["PC1"]]), ")")
pc2Lab <- paste0("PC2 (", percent(pcaVar[["PC2"]]), ")")
# Sample scores joined to the GC/length summaries and sample covariates;
# shared by the correlation matrix and all four panels
pcaData <- pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sumGc, by = "sample") %>%
  left_join(dgeList$samples, by = "sample") %>%
  as_tibble()
# Correlations between the first three PCs and the measured covariates
pcaCor <- pcaData %>%
  dplyr::select(
    PC1, PC2, PC3,
    Mean_GC = mn_gc,
    Mean_Length = mn_logLen,
    rRna_Proportion = rRNA
  ) %>%
  cor()
# PC1 vs PC2
a <- pcaData %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(colour = group), size = 2) +
  labs(x = pc1Lab, y = pc2Lab, colour = "Genotype")
# PC1 vs rRNA proportion
b <- pcaData %>%
  ggplot(aes(PC1, rRNA)) +
  geom_point(aes(colour = group), size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = pc1Lab,
    y = "rRNA Proportion of Initial Library",
    colour = "Genotype"
  )
# PC1 vs mean log-length
c <- pcaData %>%
  ggplot(aes(PC1, mn_logLen)) +
  geom_point(aes(colour = group), size = 2) +
  geom_smooth(method = "lm") +
  labs(x = pc1Lab, y = "Mean log(Length)", colour = "Genotype")
# PC1 vs mean GC content
d <- pcaData %>%
  ggplot(aes(PC1, mn_gc)) +
  geom_point(aes(colour = group), size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(x = pc1Lab, y = "Mean GC", colour = "Genotype")
# 2 x 2 layout with a shared legend
ggarrange(
  a, b, c, d, ncol = 2, nrow = 2,
  common.legend = TRUE, legend = "bottom"
) %>%
  annotate_figure(top = "Psen2S4Ter")
PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.
# Lower triangle of the PC/covariate correlation matrix, annotated with the
# coefficients shown as percentages
corrplot(
  pcaCor,
  type = "lower",
  diag = FALSE,
  addCoef.col = 1,
  addCoefasPercent = TRUE
)
Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.
# Model gene expression as a linear function of rRNA proportion
design <- model.matrix(~rRNA, data = dgeFilt$samples)
# NOTE: `voom`, `eBayes` and `topTable` reuse the names of the limma
# functions they are produced by; the names are kept in case later code
# refers to them.
voom <- voom(dgeFilt, design = design)
fit <- lmFit(voom, design = design)
eBayes <- eBayes(fit)
# All genes ranked by p-value for the rRNA coefficient
topTable <- eBayes %>%
  # `number` spelled out rather than relying on partial matching of `n`
  topTable(coef = colnames(design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
  dplyr::select(
    Geneid = gene_id,
    Symbol = gene_name,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    Location,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# Number of rows shown in the table; the caption is built from the same
# value so the two cannot disagree (it previously said 100 while 200 rows
# were displayed)
nTopGenes <- min(200, nrow(topTable))
topTable %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
  mutate(
    AveExpr = format(round(AveExpr, 2), nsmall = 2),
    logFC = format(round(logFC, 2), nsmall = 2),
    P.Value = sprintf("%.2e", P.Value),
    FDR = sprintf("%.2e", FDR)
  ) %>%
  dplyr::slice(seq_len(nTopGenes)) %>%
  datatable(
    options = list(pageLength = 20),
    class = "striped hover condensed responsive",
    filter = "top",
    caption = paste(
      "The top", nTopGenes, "differentially expressed genes.",
      nrow(dplyr::filter(topTable, DE)),
      "of",
      nrow(topTable),
      "genes were classified as DE with an FDR < 0.05."
    )
  )
## Calculate GC proportion of kmer
##
## x: a single character string (one k-mer).
## Returns the combined frequency of G and C in `x` as an unnamed proportion.
## The Biostrings functions are namespace-qualified so the helper does not
## depend on Biostrings having been attached.
gcKmer <- function(x){
  x %>%
    Biostrings::BString() %>%
    Biostrings::letterFrequency(letters = "GC", as.prob = TRUE) %>%
    unname()
}
# ---- k = 5 ----
# Per-sample jellyfish dump files
k5files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k5",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k5counts <- lapply(k5files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k5dge <- k5counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k5dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k5dge$samples$group <- colnames(k5dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k5logCpm <- cpm(k5dge, log = TRUE)
k5dist <- k5logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 5-mers")
# Row labels: sample name plus its rRNA percentage
k5labels <- with(
  k5dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k5heat <- pheatmap(
  t(k5logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "5-mer counts heatmap", labels_row = k5labels
)
k5heat$tree_row$labels <- k5labels
k5den <- ggdendrogram(k5heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 5-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k5pca <- prcomp(t(k5logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k5imp <- summary(k5pca)$importance
pander(k5imp, split.tables = Inf)
# Plot PCA
k5pcaPlot <- k5pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k5dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k5imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k5imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 5"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k5design <- model.matrix(~rRNA, data = k5dge$samples)
k5voom <- voom(k5dge, design = k5design)
k5fit <- lmFit(k5voom, design = k5design)
k5eBayes <- eBayes(k5fit)
k5topTable <- k5eBayes %>%
  topTable(coef = colnames(k5design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# ---- k = 6 ----
# Per-sample jellyfish dump files
k6files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k6",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k6counts <- lapply(k6files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k6dge <- k6counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k6dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k6dge$samples$group <- colnames(k6dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k6logCpm <- cpm(k6dge, log = TRUE)
k6dist <- k6logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 6-mers")
# Row labels: sample name plus its rRNA percentage
k6labels <- with(
  k6dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k6heat <- pheatmap(
  t(k6logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "6-mer counts heatmap", labels_row = k6labels
)
k6heat$tree_row$labels <- k6labels
k6den <- ggdendrogram(k6heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 6-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k6pca <- prcomp(t(k6logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k6imp <- summary(k6pca)$importance
pander(k6imp, split.tables = Inf)
# Plot PCA
k6pcaPlot <- k6pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k6dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k6imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k6imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 6"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k6design <- model.matrix(~rRNA, data = k6dge$samples)
k6voom <- voom(k6dge, design = k6design)
k6fit <- lmFit(k6voom, design = k6design)
k6eBayes <- eBayes(k6fit)
k6topTable <- k6eBayes %>%
  topTable(coef = colnames(k6design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# ---- k = 7 ----
# Per-sample jellyfish dump files
k7files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k7",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k7counts <- lapply(k7files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k7dge <- k7counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k7dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k7dge$samples$group <- colnames(k7dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k7logCpm <- cpm(k7dge, log = TRUE)
k7dist <- k7logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 7-mers")
# Row labels: sample name plus its rRNA percentage
k7labels <- with(
  k7dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k7heat <- pheatmap(
  t(k7logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "7-mer counts heatmap", labels_row = k7labels
)
k7heat$tree_row$labels <- k7labels
k7den <- ggdendrogram(k7heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 7-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k7pca <- prcomp(t(k7logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k7imp <- summary(k7pca)$importance
pander(k7imp, split.tables = Inf)
# Plot PCA
k7pcaPlot <- k7pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k7dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k7imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k7imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 7"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k7design <- model.matrix(~rRNA, data = k7dge$samples)
k7voom <- voom(k7dge, design = k7design)
k7fit <- lmFit(k7voom, design = k7design)
k7eBayes <- eBayes(k7fit)
k7topTable <- k7eBayes %>%
  topTable(coef = colnames(k7design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# ---- k = 8 ----
# Per-sample jellyfish dump files
k8files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k8",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k8counts <- lapply(k8files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k8dge <- k8counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k8dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k8dge$samples$group <- colnames(k8dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k8logCpm <- cpm(k8dge, log = TRUE)
k8dist <- k8logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 8-mers")
# Row labels: sample name plus its rRNA percentage
k8labels <- with(
  k8dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k8heat <- pheatmap(
  t(k8logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "8-mer counts heatmap", labels_row = k8labels
)
k8heat$tree_row$labels <- k8labels
k8den <- ggdendrogram(k8heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 8-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k8pca <- prcomp(t(k8logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k8imp <- summary(k8pca)$importance
pander(k8imp, split.tables = Inf)
# Plot PCA
k8pcaPlot <- k8pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k8dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k8imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k8imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 8"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k8design <- model.matrix(~rRNA, data = k8dge$samples)
k8voom <- voom(k8dge, design = k8design)
k8fit <- lmFit(k8voom, design = k8design)
k8eBayes <- eBayes(k8fit)
k8topTable <- k8eBayes %>%
  topTable(coef = colnames(k8design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# ---- k = 9 ----
# Per-sample jellyfish dump files
k9files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k9",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k9counts <- lapply(k9files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k9dge <- k9counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k9dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k9dge$samples$group <- colnames(k9dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k9logCpm <- cpm(k9dge, log = TRUE)
k9dist <- k9logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 9-mers")
# Row labels: sample name plus its rRNA percentage
k9labels <- with(
  k9dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k9heat <- pheatmap(
  t(k9logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "9-mer counts heatmap", labels_row = k9labels
)
k9heat$tree_row$labels <- k9labels
k9den <- ggdendrogram(k9heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 9-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k9pca <- prcomp(t(k9logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k9imp <- summary(k9pca)$importance
pander(k9imp, split.tables = Inf)
# Plot PCA
k9pcaPlot <- k9pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k9dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k9imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k9imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 9"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k9design <- model.matrix(~rRNA, data = k9dge$samples)
k9voom <- voom(k9dge, design = k9design)
k9fit <- lmFit(k9voom, design = k9design)
k9eBayes <- eBayes(k9fit)
k9topTable <- k9eBayes %>%
  topTable(coef = colnames(k9design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# ---- k = 10 ----
# Per-sample jellyfish dump files
k10files <- list.files(
  "/hpcfs/users/a1647910/20200902_Psen2S4Ter/5_jellyfishFq/k10",
  pattern = "_dumps.txt",
  full.names = TRUE
)
# One "mer count" table per file, merged on the k-mer; only the WT and Heter
# columns are kept
k10counts <- lapply(k10files, function(x){
  read_delim(x, col_names = c("mer", basename(x)), delim = " ") %>%
    set_colnames(str_remove_all(colnames(.), "_6month_F3|[0-9]*_Ps2Ex3M1_|_dumps\\.txt"))
}) %>%
  purrr::reduce(function(x, y) full_join(x, y, by = "mer")) %>%
  dplyr::select(mer, contains(c("WT", "Heter"))) %>%
  # A k-mer missing from one sample's dump comes out of the full join as NA;
  # treat it as a zero count so DGEList() does not reject the matrix
  mutate(across(-mer, function(x) replace_na(x, 0)))
k10dge <- k10counts %>%
  as.data.frame() %>%
  column_to_rownames("mer") %>%
  DGEList() %>%
  calcNormFactors()
# Attach the rRNA proportion and dataset details to each library
k10dge$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(addInfo, by = "sample") %>%
  column_to_rownames("rowname")
k10dge$samples$group <- colnames(k10dge) %>%
  str_extract("(WT|Heter)") %>%
  factor(levels = c("WT", "Heter"))
# log-CPM matrix shared by the density plot, heatmap and PCA
k10logCpm <- cpm(k10dge, log = TRUE)
k10dist <- k10logCpm %>%
  as.data.frame() %>%
  pivot_longer(everything(), names_to = "sample", values_to = "count") %>%
  ggplot(aes(x = count, colour = sample)) +
  geom_density() +
  labs(x = "intensity", title = "Distribution of 10-mers")
# Row labels: sample name plus its rRNA percentage
k10labels <- with(
  k10dge$samples,
  paste0(sample, "\n", percent(rRNA, accuracy = 0.01), " rRNA")
)
# Heatmap is built silently; only its row clustering is used below
k10heat <- pheatmap(
  t(k10logCpm),
  silent = TRUE, cluster_cols = FALSE,
  show_colnames = FALSE, fontsize = 9,
  fontsize_row = 10, border_color = NA,
  main = "10-mer counts heatmap", labels_row = k10labels
)
k10heat$tree_row$labels <- k10labels
k10den <- ggdendrogram(k10heat$tree_row, rotate = TRUE) +
  labs(title = "Hierarchical clustering of 10-mer counts") +
  theme(plot.title = element_text(size = 12))
# PCA on log-CPM so that highly abundant k-mers do not dominate the result
k10pca <- prcomp(t(k10logCpm))
# Quick inspection to check whether first two PCA components capture most of the variability
k10imp <- summary(k10pca)$importance
pander(k10imp, split.tables = Inf)
# Plot PCA
k10pcaPlot <- k10pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(k10dge$samples, by = "sample") %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point(alpha = 0.8, size = 3) +
  geom_text_repel(show.legend = FALSE) +
  labs(
    x = paste0("PC1 (", percent(k10imp[2, "PC1"]), ")"),
    y = paste0("PC2 (", percent(k10imp[2, "PC2"]), ")"),
    colour = "Genotype",
    title = "k = 10"
  ) +
  scale_colour_discrete(labels = c("Wildtype", "Mutant"))
# limma-voom fit of k-mer abundance against rRNA proportion
k10design <- model.matrix(~rRNA, data = k10dge$samples)
k10voom <- voom(k10dge, design = k10design)
k10fit <- lmFit(k10voom, design = k10design)
k10eBayes <- eBayes(k10fit)
k10topTable <- k10eBayes %>%
  topTable(coef = colnames(k10design)[2], sort.by = "p", number = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  rownames_to_column("mer") %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  dplyr::select(
    mer,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()
# Density plots for k = 5..10 in a 3 x 2 grid with one shared legend
ggarrange(
  plotlist = list(k5dist, k6dist, k7dist, k8dist, k9dist, k10dist),
  ncol = 2,
  nrow = 3,
  common.legend = TRUE,
  legend = "bottom"
)
# PCA plots for k = 5..10, same layout
ggarrange(
  plotlist = list(
    k5pcaPlot, k6pcaPlot, k7pcaPlot, k8pcaPlot, k9pcaPlot, k10pcaPlot
  ),
  ncol = 2,
  nrow = 3,
  common.legend = TRUE,
  legend = "bottom"
)
# Dendrograms for k = 5..10, same layout
ggarrange(
  plotlist = list(k5den, k6den, k7den, k8den, k9den, k10den),
  ncol = 2,
  nrow = 3,
  common.legend = TRUE,
  legend = "bottom"
)
# Render the leading rows of a k-mer results table as an interactive
# datatable.
#
# x:   a results table with columns mer, AveExpr, logFC, P.Value, FDR and DE.
# cap: caption text shown with the table.
#
# AveExpr and logFC are rounded to two decimals, P.Value and FDR are printed
# in scientific notation, and at most the first 100 rows are shown.
topRes <- function(x, cap){
  shown <- dplyr::select(x, mer, AveExpr, logFC, P.Value, FDR, DE)
  shown <- mutate(
    shown,
    AveExpr = format(round(AveExpr, 2), nsmall = 2),
    logFC = format(round(logFC, 2), nsmall = 2),
    P.Value = sprintf("%.2e", P.Value),
    FDR = sprintf("%.2e", FDR)
  )
  shown <- dplyr::slice(shown, seq_len(100))
  datatable(
    shown,
    options = list(pageLength = 10),
    class = "striped hover condensed responsive",
    filter = "top",
    caption = cap
  )
}
# Interactive table of the top-ranked 5-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k5topTable,
  cap = paste(
    "The top 100 differentially expressed 5-mers.",
    nrow(dplyr::filter(k5topTable, DE)),
    "of",
    nrow(k5topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 5-mers: logFC against -log10(p), coloured by DE status.
k5topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 4 | logFC > 4 | logFC < -2),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")
# Interactive table of the top-ranked 6-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k6topTable,
  cap = paste(
    "The top 100 differentially expressed 6-mers.",
    nrow(dplyr::filter(k6topTable, DE)),
    "of",
    nrow(k6topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 6-mers: logFC against -log10(p), coloured by DE status.
k6topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 5 | logFC > 7 | logFC < -2.5),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")
# Interactive table of the top-ranked 7-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k7topTable,
  cap = paste(
    "The top 100 differentially expressed 7-mers.",
    nrow(dplyr::filter(k7topTable, DE)),
    "of",
    nrow(k7topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 7-mers: logFC against -log10(p), coloured by DE status.
k7topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 7 | logFC > 11 | logFC < -5),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")
# Interactive table of the top-ranked 8-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k8topTable,
  cap = paste(
    "The top 100 differentially expressed 8-mers.",
    nrow(dplyr::filter(k8topTable, DE)),
    "of",
    nrow(k8topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 8-mers: logFC against -log10(p), coloured by DE status.
k8topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 7 | logFC > 20 | logFC < -10),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")
# Interactive table of the top-ranked 9-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k9topTable,
  cap = paste(
    "The top 100 differentially expressed 9-mers.",
    nrow(dplyr::filter(k9topTable, DE)),
    "of",
    nrow(k9topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 9-mers: logFC against -log10(p), coloured by DE status.
k9topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 7 | logFC > 35 | logFC < -14),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")
# Interactive table of the top-ranked 10-mers; the caption reports how many
# sequences were flagged as DE out of all those tested.
topRes(
  k10topTable,
  cap = paste(
    "The top 100 differentially expressed 10-mers.",
    nrow(dplyr::filter(k10topTable, DE)),
    "of",
    nrow(k10topTable),
    "detected sequences were classified as DE with an FDR < 0.05."
  )
)

# Volcano plot for 10-mers: logFC against -log10(p), coloured by DE status.
k10topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  # Colours are named by DE level so each level keeps its colour regardless of
  # how the levels are ordered or which levels are present.
  scale_colour_manual(values = c("FALSE" = "grey50", "TRUE" = "red")) +
  geom_text_repel(
    # Label only DE sequences that are extreme on at least one axis
    data = . %>%
      dplyr::filter(DE) %>%
      dplyr::filter(-log10(P.Value) > 8 | logFC > 45 | logFC < -17),
    aes(label = mer),
    # Fixed label colour is set outside aes() so it is not treated as a
    # third level of the colour scale.
    colour = "black"
  ) +
  labs(x = "logFC", y = expression(paste(-log[10], "(p)"))) +
  theme_bw() +
  theme(legend.position = "none")